Introduction

In this kernel we will examine house prices in King County, USA, and how they vary with location, living space, number of bedrooms, etc. We will explore the data and look for patterns. The kernel includes a map showing the distribution of the houses and their prices, and we will build a model that can predict the price of a house.

libraries

library(caTools)
library(plotly)
library(randomcoloR)
library(tidyverse)
library(psych)
library(ggpubr)
library(wesanderson)
library(backports)
library(lubridate)
library(ggplot2)
library(moonBook)
library(mycor)
library(ggcorrplot)
library(scales)
library(leaflet)
library(wesanderson)
library(randomForest)
library(caret)
library(infer)
# Load the King County house sales data; text columns are read as factors.
houses <- read.csv(
  file = "E:/test files/kc_house_data.csv",
  stringsAsFactors = TRUE
)

Exploring The Data

Columns of Data

1-id = Identification number

2-price = House price in dollars

3-bedrooms = Count of bedrooms

4-bathrooms = Count of bathrooms

5-sqft_living = Living space

6-sqft_lot = Square footage of the lot (the land the house stands on)

7-floors= Count of floors

8-waterfront = House on the seaside or not (1/0)

9-view = View point of house (0 - 4)

10-condition = Condition rating of the house (1 - 5)

11-grade = Grade of the house (1 - 13)

12-sqft_above = Square footage of the above ground

13-sqft_basement = Square footage of the below ground

14-yr_built = The year the house was built

15-yr_renovated = The year the house was renovated

16-zipcode = Zipcode of house

17-lat = Latitude

18-long = Longitude

19-sqft_living15 = Living space in houses,they were sold in 2015

20-sal_year = The year house was sold

21-sqft_lot15 = Square footage of house on land,the houses sold in 2015

# Preview the first rows of the raw data.
houses %>% head()
##           id            date   price bedrooms bathrooms sqft_living sqft_lot
## 1 7129300520 20141013T000000  221900        3      1.00        1180     5650
## 2 6414100192 20141209T000000  538000        3      2.25        2570     7242
## 3 5631500400 20150225T000000  180000        2      1.00         770    10000
## 4 2487200875 20141209T000000  604000        4      3.00        1960     5000
## 5 1954400510 20150218T000000  510000        3      2.00        1680     8080
## 6 7237550310 20140512T000000 1225000        4      4.50        5420   101930
##   floors waterfront view condition grade sqft_above sqft_basement yr_built
## 1      1          0    0         3     7       1180             0     1955
## 2      2          0    0         3     7       2170           400     1951
## 3      1          0    0         3     6        770             0     1933
## 4      1          0    0         5     7       1050           910     1965
## 5      1          0    0         3     8       1680             0     1987
## 6      1          0    0         3    11       3890          1530     2001
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930
# Show the type of every column.
houses %>% str()
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
# Per-column summary statistics (n, mean, sd, median, range, skew, ...).
describe(x = houses)
##               vars     n          mean           sd       median       trimmed
## id               1 21613 4580301520.86 2.876566e+09  3.90493e+09 4500014357.18
## date*            2 21613        178.30 1.095000e+02  1.68000e+02        176.64
## price            3 21613     540088.14 3.671272e+05  4.50000e+05     481704.02
## bedrooms         4 21613          3.37 9.300000e-01  3.00000e+00          3.34
## bathrooms        5 21613          2.11 7.700000e-01  2.25000e+00          2.07
## sqft_living      6 21613       2079.90 9.184400e+02  1.91000e+03       1984.40
## sqft_lot         7 21613      15106.97 4.142051e+04  7.61800e+03       8259.53
## floors           8 21613          1.49 5.400000e-01  1.50000e+00          1.45
## waterfront       9 21613          0.01 9.000000e-02  0.00000e+00          0.00
## view            10 21613          0.23 7.700000e-01  0.00000e+00          0.00
## condition       11 21613          3.41 6.500000e-01  3.00000e+00          3.30
## grade           12 21613          7.66 1.180000e+00  7.00000e+00          7.58
## sqft_above      13 21613       1788.39 8.280900e+02  1.56000e+03       1682.94
## sqft_basement   14 21613        291.51 4.425800e+02  0.00000e+00        205.25
## yr_built        15 21613       1971.01 2.937000e+01  1.97500e+03       1973.10
## yr_renovated    16 21613         84.40 4.016800e+02  0.00000e+00          0.00
## zipcode         17 21613      98077.94 5.351000e+01  9.80650e+04      98074.72
## lat             18 21613         47.56 1.400000e-01  4.75700e+01         47.57
## long            19 21613       -122.21 1.400000e-01 -1.22230e+02       -122.23
## sqft_living15   20 21613       1986.55 6.853900e+02  1.84000e+03       1914.07
## sqft_lot15      21 21613      12768.46 2.730418e+04  7.62000e+03       7903.21
##                        mad        min           max        range  skew kurtosis
## id            3.561991e+09 1000102.00 9900000190.00 9.899000e+09  0.24    -1.26
## date*         1.438100e+02       1.00        372.00 3.710000e+02  0.15    -1.27
## price         2.223900e+05   75000.00    7700000.00 7.625000e+06  4.02    34.57
## bedrooms      1.480000e+00       0.00         33.00 3.300000e+01  1.97    49.05
## bathrooms     7.400000e-01       0.00          8.00 8.000000e+00  0.51     1.28
## sqft_living   8.006000e+02     290.00      13540.00 1.325000e+04  1.47     5.24
## sqft_lot      3.881450e+03     520.00    1651359.00 1.650839e+06 13.06   284.98
## floors        7.400000e-01       1.00          3.50 2.500000e+00  0.62    -0.49
## waterfront    0.000000e+00       0.00          1.00 1.000000e+00 11.38   127.59
## view          0.000000e+00       0.00          4.00 4.000000e+00  3.40    10.89
## condition     0.000000e+00       1.00          5.00 4.000000e+00  1.03     0.53
## grade         1.480000e+00       1.00         13.00 1.200000e+01  0.77     1.19
## sqft_above    6.671700e+02     290.00       9410.00 9.120000e+03  1.45     3.40
## sqft_basement 0.000000e+00       0.00       4820.00 4.820000e+03  1.58     2.71
## yr_built      3.410000e+01    1900.00       2015.00 1.150000e+02 -0.47    -0.66
## yr_renovated  0.000000e+00       0.00       2015.00 2.015000e+03  4.55    18.69
## zipcode       6.227000e+01   98001.00      98199.00 1.980000e+02  0.41    -0.85
## lat           1.600000e-01      47.16         47.78 6.200000e-01 -0.49    -0.68
## long          1.500000e-01    -122.52       -121.32 1.200000e+00  0.88     1.05
## sqft_living15 6.078700e+02     399.00       6210.00 5.811000e+03  1.11     1.60
## sqft_lot15    3.713910e+03     651.00     871200.00 8.705490e+05  9.51   150.71
##                        se
## id            19566662.38
## date*                0.74
## price             2497.23
## bedrooms             0.01
## bathrooms            0.01
## sqft_living          6.25
## sqft_lot           281.75
## floors               0.00
## waterfront           0.00
## view                 0.01
## condition            0.00
## grade                0.01
## sqft_above           5.63
## sqft_basement        3.01
## yr_built             0.20
## yr_renovated         2.73
## zipcode              0.36
## lat                  0.00
## long                 0.00
## sqft_living15        4.66
## sqft_lot15         185.73

We can drop the month and day and keep only the sale year, because I think they are not important for our purposes.

# Replace the "T000000" time suffix with a space, parse what is left as a
# year-month-day date, and keep only the year.
houses$date <- houses$date %>%
  gsub(pattern = "T000000", replacement = " ", x = ., fixed = TRUE) %>%
  ymd() %>%
  year()

remove unwanted columns

# Drop the id and zipcode columns.
houses <- houses %>%
  select(-id, -zipcode)

Data Visualization and Analysis

Correlations Between Variables

# Correlation matrix of the remaining columns; the coordinates and the sale
# year are left out.
corr <- cor(select(houses, -long, -lat, -date))

# Lower-triangle correlation heat map with the coefficients printed in the
# cells.
corr.plot <- ggcorrplot(
  corr,
  type = "lower",
  outline.col = "white",
  lab = TRUE,
  lab_size = 1
) +
  labs(title = "Data Correlation")

# Convert to an interactive plotly widget. ggplotly() has no `label`
# argument, so the former `label = style` (which passed the plotly::style
# function object through `...`) has been dropped.
ggplotly(corr.plot)

Price Distribution

# Histogram of all sale prices, with the x axis in short SI notation.
gghistogram(
  houses$price,
  fill = "skyblue",
  bins = 60,
  title = " Price Distrbuation"
) +
  scale_x_continuous(labels = label_number_si())

# Same histogram with finer bins, zoomed in on prices up to 1M.
# - Only one x scale is kept: the former `scales::comma` scale was replaced
#   by the `scales::dollar` scale added right after it, so it had no effect.
# - `interactive = TRUE` is removed; NOTE(review): it does not appear to be a
#   gghistogram() argument and was only forwarded through `...`.
# - `xlim` is spelled out instead of relying on partial matching of `x`.
# coord_cartesian() zooms without discarding data, so the bins are still
# computed from every house.
gghistogram(
  houses$price,
  fill = "skyblue",
  bins = 150,
  title = " Price Distrbuation < 1M"
) +
  scale_x_continuous(labels = scales::dollar) +
  coord_cartesian(xlim = c(0, 1000000))

Variables Affecting The Price

Bedrooms: It might seem obvious that more bedrooms mean a higher price, but this plot shows that houses with 11 bedrooms cost less on average than houses with 7; that is because of the effect of other variables.

# Treat the bedroom count as a category so that each count gets its own bar.
houses$bedrooms <- factor(houses$bedrooms)

# Mean price per bedroom count, leaving out the "30" and "33" levels.
houses %>%
  filter(bedrooms != "30" & bedrooms != "33") %>%
  group_by(bedrooms) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "bedrooms",
    y = "mean",
    fill = "bedrooms",
    palette = "Set3",
    size = 1.5
  ) +
  labs(title = " Price Average By Bedrooms") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = scales::dollar, n.breaks = 15)

Grades

# Treat the grade as a category so that each grade gets its own bar.
houses$grade <- factor(houses$grade)

# Mean price per grade. The summarised table arrives through the pipe as the
# `data` argument; the extra `houses` that used to follow it was matched to
# ggbarplot()'s next positional parameter (`combine`) and has been removed.
# The palette is drawn at random, so bar colours differ between runs.
houses %>%
  group_by(grade) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "grade",
    y = "mean",
    fill = "grade",
    palette = randomColor(count = 12, luminosity = "light"),
    size = 1.5
  ) +
  scale_y_continuous(labels = scales::dollar, n.breaks = 15) +
  theme(legend.position = "none") +
  labs(title = "Price Average by grade")

View

# Treat the view score as a category so that each score gets its own bar.
houses$view <- factor(houses$view)

# Mean price per view score.
houses %>%
  group_by(view) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "view",
    y = "mean",
    fill = "view",
    palette = wes_palette("Darjeeling1"),
    size = 1.5
  ) +
  scale_y_continuous(labels = dollar, n.breaks = 15) +
  labs(title = " Price Average By View") +
  theme(legend.position = "none")

Floors

# Round half floors to whole numbers and treat the result as a category.
# NOTE(review): round() follows the round-half-to-even rule, so 1.5 and 2.5
# both become 2 while 3.5 becomes 4 -- confirm this grouping is intended.
houses$floors <- round(houses$floors) %>% factor()

# Mean price per floor count. The summarised table arrives through the pipe
# as the `data` argument; the extra `houses` that used to follow it was
# matched to ggbarplot()'s next positional parameter (`combine`) and has been
# removed.
houses %>%
  group_by(floors) %>%
  summarise(mean = mean(price)) %>%
  ggbarplot(
    x = "floors",
    y = "mean",
    fill = "floors",
    palette = wes_palette("BottleRocket2"),
    size = 1.5
  ) +
  scale_y_continuous(n.breaks = 10, labels = label_dollar()) +
  labs(title = "Price Average By Floors Number") +
  theme(legend.position = "none")

living Space

# Price against living space with a linear trend line.
# - geom_smooth() used to receive `size` twice (1.2 and 2); only one value is
#   kept.
# - The former ylim(70000, 2000000) was replaced by the scale_y_continuous()
#   added later in the same chain, so it had no effect and is removed.
# - xlim() drops houses above 5000 sqft before the trend line is fitted,
#   whereas coord_cartesian() only zooms the y axis.
# - `ylim` is spelled out instead of relying on partial matching of `y`.
ggscatter(
  houses,
  "sqft_living",
  "price",
  color = "purple",
  size = 0.5,
  alpha = 0.5
) +
  geom_smooth(method = "lm", size = 1.2, col = "darkred") +
  xlim(0, 5000) +
  labs(
    x = 'Tooal living space',
    y = 'Price (USD)',
    title = "Price By living Space"
  ) +
  scale_y_continuous(labels = label_dollar(), n.breaks = 10) +
  coord_cartesian(ylim = c(0, 2000000))

Year Sold

# Treat the sale year as a category, then show the mean price per year.
houses[["date"]] <- factor(houses[["date"]])
houses %>%
  group_by(date) %>%
  summarise(mean = mean(price))
## # A tibble: 2 x 2
##   date     mean
##   <fct>   <dbl>
## 1 2014  539181.
## 2 2015  541989.

Year Built

# Bin the construction year into three periods.
# - The labels are given to cut() directly, so they no longer depend on the
#   exact text of cut()'s auto-generated interval names.
# - include.lowest = TRUE puts houses built in 1900 (the first break) into the
#   first bin; without it they fall outside every interval and become NA.
houses$yearb <- cut(
  houses$yr_built,
  breaks = c(1900, 1950, 2000, 2020),
  labels = c("1900-1950", "1950-2000", "2000-2020"),
  include.lowest = TRUE
)

# Mean price per construction period; rows outside every bin are skipped.
houses %>%
  group_by(yearb) %>%
  summarise(mean = mean(price, na.rm = TRUE)) %>%
  filter(!is.na(yearb)) %>%
  ggbarplot("yearb", "mean", fill = "yearb", color = "black", size = 2) +
  labs(
    title = "Price Average By Year Built",
    x = "Year Built",
    y = "Praice Average"
  ) +
  scale_y_continuous(labels = dollar, n.breaks = 10) +
  theme(legend.position = "none")

Map

# Bin the prices for the map legend. The labels are given to cut() directly,
# so they no longer depend on the exact text of cut()'s auto-generated
# interval names.
houses$PriceBin <- cut(
  houses$price,
  breaks = c(0, 250000, 500000, 750000, 1000000, 2000000, 99900000),
  labels = c(
    "0 - 250K $",
    "250K $ - 500K $",
    "500K $ - 750K $",
    "750K $ - 1M $",
    "1M $ - 2M $",
    "2M $ - 10M $"
  )
)

# The map is centred on the median coordinates.
center_lon <- median(houses$long, na.rm = TRUE)
center_lat <- median(houses$lat, na.rm = TRUE)

# One colour per price bin.
factpal <- colorFactor(
  c("red", "blue", "yellow", "orange", "#0B5345", "black"),
  houses$PriceBin
)

# Hover label for every house. The former format string had two `%s` slots
# but four arguments, so the bedroom count was never shown; the fixed text is
# now part of the format and only the two values are substituted.
labels <- sprintf(
  "<strong>Price: </strong>%s<br/><strong>Bedrooms: </strong>%s",
  dollar(houses$price),
  as.character(houses$bedrooms)
) %>%
  lapply(htmltools::HTML)

# One circle per house, coloured by price bin, with the hover label above and
# a click popup showing bedrooms and living space.
leaflet(houses) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    color = ~factpal(PriceBin),
    label = labels,
    popup = paste0(
      'Bedrooms: ', houses$bedrooms,
      ',   Living Space: ', houses$sqft_living
    )
  ) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 12) %>%
  addLegend(
    "bottomright",
    pal = factpal,
    values = ~PriceBin,
    title = "House Price Distribution",
    opacity = 1
  )

The Model (Random Forest)

First we need to remove some variables

# Reload the raw data for modelling and drop the columns that are not used as
# predictors.
houses1 <- read.csv(
  file = "E:/test files/kc_house_data.csv",
  stringsAsFactors = TRUE
) %>%
  dplyr::select(-id, -date, -yr_renovated, -zipcode)

Now split our data into train and test data

# Fix the random seed so that the split (and the model fitted after it) can
# be reproduced from run to run.
set.seed(123)

# Logical mask: TRUE for the 75% of rows that go to the training set. It is
# no longer called `sample`, which is also the name of a base R function, and
# TRUE/FALSE replace the reassignable shortcuts T/F.
is_train <- sample.split(houses1$price, SplitRatio = 0.75)
train <- subset(houses1, is_train)
test <- subset(houses1, !is_train)

Create the model

# Fit a random forest regression of price on every other column of `train`,
# then print the fitted model summary.
model <- randomForest(price ~ ., data = train)
print(model)
## 
## Call:
##  randomForest(formula = price ~ ., data = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 19006646713
##                     % Var explained: 87.03

We see that the model explains about 87% of the variance in price, which is good.

Test the model

# Predict prices for the held-out rows; column 1 (price) is dropped from the
# predictors. The result is no longer stored in a variable named `predict`,
# which is also the name of the function being called.
predictions <- predict(model, test[, -1])

# postResample() expects the predictions first and the observed values
# second; the arguments are now named so they cannot be swapped.
postResample(pred = predictions, obs = test$price)
##         RMSE     Rsquared          MAE 
## 1.059046e+05 8.771086e-01 6.463886e+04

On the test set the model reaches an R-squared of about 0.88 (RMSE of about $105,900, MAE of about $64,600).

Improving The Model

We made some changes to the data to improve the model: for example, we converted the year built from a number to a factor, converted the basement area to a (true/false) indicator, rounded some values, and resampled the training data.

# Bin the construction year into three periods.
# - The bins are now computed from houses1 itself rather than from the
#   separate `houses` data frame.
# - The labels are given to cut() directly, so they no longer depend on the
#   exact text of cut()'s auto-generated interval names.
# - include.lowest = TRUE keeps houses built in 1900 (the first break) in the
#   first bin instead of turning them into NA.
houses1$yearb <- cut(
  houses1$yr_built,
  breaks = c(1900, 1950, 2000, 2020),
  labels = c("1900-1950", "1950-2000", "2000-2020"),
  include.lowest = TRUE
)
houses1 <- houses1 %>%
  dplyr::select(-yr_built) %>%
  filter(!is.na(yearb))

# Coarsen bathrooms and floors to whole numbers.
houses1$bathrooms <- round(houses1$bathrooms)
houses1$floors <- round(houses1$floors)

# Replace the basement area by a 0/1 indicator of whether a basement exists.
houses1$basement <- ifelse(houses1$sqft_basement > 0, 1, 0)
houses1 <- houses1 %>% dplyr::select(-sqft_basement)

# Reproducible 75/25 split.
set.seed(123)
is_train <- sample.split(houses1$price, SplitRatio = 0.75)
train <- subset(houses1, is_train)
test <- subset(houses1, !is_train)

# The training set is no longer bootstrapped three times with rep_sample_n()
# before fitting: stacking resampled copies puts duplicates of the same house
# both inside and outside each tree's bootstrap sample, which inflates the
# out-of-bag "% Var explained" without making the model better.
model <- randomForest(price ~ ., train)
model

# Score the new model on the held-out rows so that it can be compared with
# the first model on equal terms.
postResample(pred = predict(model, test), obs = test$price)
## 
## Call:
##  randomForest(formula = price ~ ., data = train[, -1]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 1594180771
##                     % Var explained: 98.92

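The variance explained rose by more than 10 percentage points, a very good improvement. Note, however, that this figure is the out-of-bag estimate on the training data, so it should be confirmed on the held-out test set.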